In [1]:
import pandas as pd
In [2]:
# Toy sales table: one row per (weekday, city) with units sold per product.
sales = pd.DataFrame(
    dict(
        weekday=['Sun', 'Sun', 'Mon', 'Mon'],
        city=['Austin', 'Dallas', 'Austin', 'Dallas'],
        bread=[139, 237, 326, 456],
        butter=[20, 45, 70, 98],
    )
)
In [3]:
# Per-weekday count of entries in every non-grouping column.
sales.groupby(by='weekday').count()
Out[3]:
In [4]:
# Same count, restricted to the single 'city' column.
sales.groupby(by='weekday')['city'].count()
Out[4]:
In [5]:
# Count two selected columns per weekday.
(
    sales
    .groupby(by='weekday')[['city', 'butter']]
    .count()
)
Out[5]:
In [6]:
# Group on two keys at once; the mean of 'butter' is computed per (weekday, city) pair.
(
    sales
    .groupby(by=['weekday', 'city'])['butter']
    .mean()
)
Out[6]:
In [7]:
# Group by an external Series rather than by one of the frame's own columns.
customers = pd.Series(data=['Dave', 'Alice', 'Bob', 'Alice'])
sales.groupby(by=customers)['bread'].sum()
Out[7]:
In [8]:
# Grouping on a categorical column is more efficient (less memory, faster)
# than grouping on plain strings; inspect the current dtypes first.
sales.dtypes
Out[8]:
In [9]:
# Re-encode 'weekday' as a categorical column, then show the updated dtypes.
sales['weekday'] = sales['weekday'].astype(dtype='category')
sales.dtypes
Out[9]:
In [10]:
# Sum only the numeric product columns per weekday.
# Selecting the columns explicitly keeps the string 'city' column out of the
# sum (pandas >= 2.0 no longer drops it silently), and passing `observed`
# makes the categorical-grouper behaviour explicit instead of relying on a
# version-dependent default.
sales.groupby('weekday', observed=False)[['bread', 'butter']].sum()
Out[10]:
In [11]:
# Apply several aggregations at once; each selected column gets a 'max' and a 'sum'.
(
    sales
    .groupby(by='city')[['bread', 'butter']]
    .aggregate(['max', 'sum'])
)
Out[11]:
In [12]:
# custom aggregation
def data_range(series: pd.Series):
    """Return the spread of ``series``: its maximum minus its minimum.

    Intended as a custom aggregation for ``groupby(...).agg``.

    Parameters
    ----------
    series : pd.Series
        Values of one column (or of one group of that column).

    Returns
    -------
    scalar
        ``series.max() - series.min()``.
    """
    return series.max() - series.min()
# Use the custom function as the aggregation for both product columns.
(
    sales
    .groupby(by='weekday')[['bread', 'butter']]
    .aggregate(data_range)
)
Out[12]:
In [13]:
# A dict maps each column to its own aggregation: built-in name or callable.
(
    sales
    .groupby(by=customers)[['bread', 'butter']]
    .aggregate(
        {
            'bread': 'sum',
            'butter': data_range,
        }
    )
)
Out[13]:
In [14]:
def zscore(series: pd.Series) -> pd.Series:
    """Standardize ``series``: subtract its mean and divide by its standard deviation.

    The result has the same length and index as the input, which makes the
    function usable with ``groupby(...).transform``.

    Parameters
    ----------
    series : pd.Series
        Numeric values to standardize.

    Returns
    -------
    pd.Series
        ``(series - series.mean()) / series.std()``.
    """
    return (series - series.mean()) / series.std()
In [15]:
#!pip install pydataset
from pydataset import data
In [16]:
# Load the Boston dataset and keep a copy of the row label in an explicit 'id' column.
boston = data('Boston').assign(id=lambda frame: frame.index)
boston.head()
Out[16]:
In [27]:
# Standardize 'rm' against the mean/std of the whole column and preview it.
zscore(series=boston['rm']).head()
Out[27]:
In [28]:
# Number of values produced by the whole-column z-score.
len(zscore(series=boston['rm']))
Out[28]:
In [29]:
# Standardize 'rm' within each 'rad' group instead of across the whole column.
(
    boston
    .groupby(by='rad')['rm']
    .transform(zscore)
    .head()
)
Out[29]:
In [30]:
# Number of values produced by the group-wise transform.
len(
    boston
    .groupby(by='rad')['rm']
    .transform(zscore)
)
Out[30]:
In [31]:
def zscore_with_indus_and_id(group: pd.DataFrame) -> pd.DataFrame:
    """Build a frame with the z-scored ``rm`` of ``group`` plus its ``rad`` and ``id``.

    Note that, despite the function's name, no ``indus`` column is read; the
    columns carried through unchanged are ``rad`` and ``id``.

    Parameters
    ----------
    group : pd.DataFrame
        Rows of one group; must provide the columns ``rm``, ``rad`` and ``id``.

    Returns
    -------
    pd.DataFrame
        Columns ``rm`` (standardized within ``group`` via ``zscore``),
        ``rad`` and ``id``.
    """
    return pd.DataFrame(
        {
            'rm': zscore(group['rm']),
            'rad': group['rad'],
            'id': group['id'],
        }
    )
In [33]:
# Run the frame-returning helper once per 'rad' group and preview the combined result.
# NOTE(review): the helper reads group['rad']; newer pandas versions may exclude
# the grouping column from the frame passed to apply -- confirm against the
# installed version.
(
    boston
    .groupby(by='rad')
    .apply(zscore_with_indus_and_id)
    .head()
)
Out[33]:
In [34]:
# Keep the GroupBy object around to inspect it: its type, the type of its
# `groups` mapping, and the group keys -- one per output line.
splitting = boston.groupby(by='rad')
print(
    type(splitting),
    type(splitting.groups),
    splitting.groups.keys(),
    sep='\n',
)
In [38]:
# Show the mapping from each group key to the row labels that belong to that group.
splitting.groups
Out[38]:
In [42]:
### Group object iteration
# Iterating over the GroupBy object yields (group key, sub-frame) pairs;
# print the mean 'age' of each sub-frame next to its key.
for group_name, group in splitting:
    avg = group['age'].mean()
    print(group_name, avg)
In [43]:
# The same per-group mean as the explicit loop, computed by groupby directly.
boston.groupby(by='rad')['age'].mean()
Out[43]:
In [57]:
# Per group, average 'age' over only those rows whose 'zn' equals 0.
for group_name, group in splitting:
    avg = group.loc[group['zn'] == 0, 'age'].mean()
    print(group_name, avg)
In [60]:
# Same computation as a dict: group key -> mean 'age' of that group's rows with zn == 0.
{
    rad_key: rad_rows.loc[rad_rows['zn'] == 0, 'age'].mean()
    for rad_key, rad_rows in splitting
}
Out[60]:
In [65]:
# Group by 'rad' together with a boolean mask (zn == 0) so both the matching
# and non-matching rows get their own mean 'age' per 'rad' value.
filt = boston['zn'].eq(0)
res = boston.groupby(by=['rad', filt])['age'].mean()
res
Out[65]:
In [69]:
# Pivot the innermost index level (the boolean mask) into columns.
res.unstack(level=-1)
Out[69]:
In [ ]: